Los datos están relacionados con campañas de marketing directo (llamadas telefónicas) de una institución bancaria portuguesa. A menudo se requería más de un contacto con el mismo cliente para poder evaluar si el producto (depósito bancario a plazo) sería ('sí') o no ('no') suscrito. Por tanto, el objetivo de la clasificación es predecir si el cliente suscribirá (sí/no) un depósito a plazo (variable y).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Use seaborn's default theme with color_codes so "b"/"r" map to palette colors.
sns.set_theme(color_codes=True)
import warnings
# NOTE(review): silencing *all* warnings hides deprecation notices; consider scoping this.
warnings.filterwarnings("ignore")
# UCI "bank-additional-full" dataset; fields are semicolon-separated.
df = pd.read_csv("bank-additional-full.csv", delimiter=";")
# Show every column when displaying frames (21 columns total).
pd.set_option("display.max_columns", None)
df.head()
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | 261 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | 149 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | 226 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | 151 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | 307 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
# Column dtypes and non-null counts: 41188 rows, 21 columns, no nulls.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41188 entries, 0 to 41187 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 41188 non-null int64 1 job 41188 non-null object 2 marital 41188 non-null object 3 education 41188 non-null object 4 default 41188 non-null object 5 housing 41188 non-null object 6 loan 41188 non-null object 7 contact 41188 non-null object 8 month 41188 non-null object 9 day_of_week 41188 non-null object 10 duration 41188 non-null int64 11 campaign 41188 non-null int64 12 pdays 41188 non-null int64 13 previous 41188 non-null int64 14 poutcome 41188 non-null object 15 emp.var.rate 41188 non-null float64 16 cons.price.idx 41188 non-null float64 17 cons.conf.idx 41188 non-null float64 18 euribor3m 41188 non-null float64 19 nr.employed 41188 non-null float64 20 y 41188 non-null object dtypes: float64(5), int64(5), object(11) memory usage: 6.6+ MB
# Explicit missing-value check; every column reports 0 nulls.
df.isnull().sum()
age 0 job 0 marital 0 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 dtype: int64
# Select the categorical columns (plus the target `y`) into their own frame.
categorical_columns = ["job", "marital", "education", "default", "housing",
                       "loan", "contact", "month", "day_of_week", "poutcome", "y"]
df_categoricos = df[categorical_columns]
df_categoricos.head()
| job | marital | education | default | housing | loan | contact | month | day_of_week | poutcome | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | nonexistent | no |
| 1 | services | married | high.school | unknown | no | no | telephone | may | mon | nonexistent | no |
| 2 | services | married | high.school | no | yes | no | telephone | may | mon | nonexistent | no |
| 3 | admin. | married | basic.6y | no | no | no | telephone | may | mon | nonexistent | no |
| 4 | services | married | high.school | no | no | yes | telephone | may | mon | nonexistent | no |
# Select the numeric columns into their own frame.
numeric_columns = ["age", "duration", "campaign", "pdays", "previous",
                   "emp.var.rate", "cons.price.idx", "cons.conf.idx",
                   "euribor3m", "nr.employed"]
df_numericos = df[numeric_columns]
df_numericos.head()
| age | duration | campaign | pdays | previous | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | 261 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 1 | 57 | 149 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 2 | 37 | 226 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 3 | 40 | 151 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 4 | 56 | 307 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
# Reference: https://seaborn.pydata.org/generated/seaborn.countplot.html
# One count plot per categorical feature, split by the target `y`.
cat_vars = ["job", "marital", "education", "default", "housing", "loan",
            "contact", "month", "day_of_week", "poutcome"]
# 2x5 grid of subplots, one axis per variable.
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for ax, var in zip(axs, cat_vars):
    sns.countplot(x=var, hue="y", data=df_categoricos, ax=ax)
    # Long category names need vertical tick labels.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Avoid overlapping subplots.
fig.tight_layout()
plt.show()
# Reference: https://seaborn.pydata.org/generated/seaborn.histplot.html
# Normalized bar charts: within each category the fill shows the
# proportion of y == "yes" vs y == "no".
cat_vars = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "day_of_week", "poutcome"]
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for i, var in enumerate(cat_vars):
    sns.histplot(x=var, hue="y", data=df_categoricos, ax=axs[i],
                 multiple="fill", kde=False, element="bars", fill=True,
                 stat="density")
    # FIX: rotate the labels seaborn already placed instead of overwriting
    # them with `unique()` output — that ordering is not guaranteed to match
    # the axis and could silently mislabel the ticks.
    axs[i].set_xticklabels(axs[i].get_xticklabels(), rotation=90)
    axs[i].set_xlabel(var)
# Avoid overlapping subplots.
fig.tight_layout()
plt.show()
La mayoría de las personas que suscriben los depósitos bancarios a plazo son: jubilados y estudiantes.
La mayoría de las personas que suscriben los depósitos bancarios a plazo son contactadas por vía celular.
La mayoría de las personas que suscriben los depósitos bancarios a plazo tienen su último contacto en: octubre, diciembre, marzo, septiembre
La mayoría de las personas que suscriben el depósito bancario a plazo han valorado exitosamente la campaña de marketing.
# Reference: https://seaborn.pydata.org/generated/seaborn.boxplot.html
# Box plots: quick outlier overview for every numeric feature.
num_vars = ["age", "duration", "campaign", "pdays", "previous", "emp.var.rate",
            "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for ax, var in zip(axs, num_vars):
    sns.boxplot(x=var, data=df, ax=ax)
fig.tight_layout()
plt.show()
# Reference: https://seaborn.pydata.org/generated/seaborn.violinplot.html
# Violin plots: distribution shape of every numeric feature.
num_vars = ["age", "duration", "campaign", "pdays", "previous", "emp.var.rate",
            "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for position, feature in enumerate(num_vars):
    sns.violinplot(x=feature, data=df, ax=axs[position])
fig.tight_layout()
plt.show()
# Violin plots of each numeric feature, split by the target `y`.
num_vars = ["age", "duration", "campaign", "pdays", "previous", "emp.var.rate",
            "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for ax, feature in zip(axs, num_vars):
    sns.violinplot(x=feature, y="y", data=df, ax=ax)
fig.tight_layout()
plt.show()
# Plain histograms of every numeric feature.
num_vars = ["age", "duration", "campaign", "pdays", "previous", "emp.var.rate",
            "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for ax, feature in zip(axs, num_vars):
    sns.histplot(x=feature, data=df, ax=ax)
fig.tight_layout()
plt.show()
# Stacked histograms: each numeric feature's distribution, stacked by `y`.
num_vars = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
            'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for position, feature in enumerate(num_vars):
    sns.histplot(x=feature, hue='y', data=df, ax=axs[position], multiple="stack")
fig.tight_layout()
plt.show()
# Documentar sns.pairplot: https://seaborn.pydata.org/generated/seaborn.pairplot.html
# Listado de variables númericas.
num_vars = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m', 'nr.employed']
# Crear una Matriz para los diagramas de dispersión
sns.pairplot(df, hue='y')
<seaborn.axisgrid.PairGrid at 0x22e689ba220>
Devuelve los valores únicos de una serie de objetos.
Los valores únicos se devuelven en orden de aparición. Los valores únicos se basan en tablas hash, por lo tanto, NO se ordenan.
https://pandas.pydata.org/docs/reference/api/pandas.Series.unique.html
# Inspect the distinct categories of every categorical column in one pass,
# instead of eleven copy-pasted `.unique()` cells.
# (Series.unique preserves first-appearance order; it is hash-based, not sorted.)
for col in ["job", "marital", "education", "default", "housing", "loan",
            "contact", "month", "day_of_week", "poutcome", "y"]:
    print(col, "->", df[col].unique())
# Label-encode every categorical column in one loop instead of repeating the
# same four-line cell per column. LabelEncoder assigns integer codes in the
# sorted order of each column's category strings, so fitting a fresh encoder
# per column inside the loop reproduces the original per-cell results exactly.
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
for col in ["job", "marital", "education", "default", "housing", "loan",
            "contact", "month", "day_of_week", "poutcome", "y"]:
    df[col] = label_encoder.fit_transform(df[col])
    # Show the resulting integer codes for this column.
    print(col, "->", df[col].unique())
df.head()
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 6 | 1 | 261 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 1 | 57 | 7 | 1 | 3 | 1 | 0 | 0 | 1 | 6 | 1 | 149 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 2 | 37 | 7 | 1 | 3 | 0 | 2 | 0 | 1 | 6 | 1 | 226 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 3 | 40 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 6 | 1 | 151 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 4 | 56 | 7 | 1 | 3 | 0 | 0 | 2 | 1 | 6 | 1 | 307 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
"Y" Label
# Class balance of the (now 0/1-encoded) target.
# FIX: pass the Series as `x=` — positional data arguments to countplot are
# deprecated in seaborn 0.12 and removed in 0.13.
sns.countplot(x=df['y'])
df['y'].value_counts()
0 36548 1 4640 Name: y, dtype: int64
from sklearn.utils import resample
# Split the frame into majority (y == 0) and minority (y == 1) classes.
df_majority = df[(df['y']==0)]
df_minority = df[(df['y']==1)]
# Upsample the minority class with replacement until it matches the majority
# class size. Derive the target count from the data instead of hard-coding
# 36548 (same value for this dataset, but robust to re-runs).
df_minority_upsampled = resample(df_minority,
                                 replace=True,              # sample with replacement
                                 n_samples=len(df_majority),  # match majority size
                                 random_state=0)            # reproducible
# Combine the majority class with the upsampled minority class.
df_upsampled = pd.concat([df_minority_upsampled, df_majority])
# FIX: pass the Series as `x=` (positional use removed in seaborn >= 0.13).
sns.countplot(x=df_upsampled['y'])
df_upsampled['y'].value_counts()
1 36548 0 36548 Name: y, dtype: int64
Detectar outliers es tedioso, especialmente cuando se tienen múltiples tipos de datos.
Por lo tanto, tenemos diferentes formas de detectar valores atípicos para diferentes tipos de datos.
En cuanto a los datos distribuidos normalmente, podemos usar el método Z-Score;
para datos sesgados (skewed), se usa el IQR.
def remove_outliers_iqr(df, columns):
    """Drop IQR outliers from `df`, one column at a time.

    For each column, rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed.
    Bounds are recomputed on the already-filtered frame, so column order
    matters. Returns a new DataFrame; the input frame is not modified.
    """
    filtered = df
    for column in columns:
        first_quartile = filtered[column].quantile(0.25)
        third_quartile = filtered[column].quantile(0.75)
        spread = third_quartile - first_quartile
        low = first_quartile - 1.5 * spread
        high = third_quartile + 1.5 * spread
        within_bounds = (filtered[column] >= low) & (filtered[column] <= high)
        filtered = filtered[within_bounds]
    return filtered
# Columns screened for outliers (all numeric features).
columns_to_check = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
'euribor3m', 'nr.employed']
# Apply the IQR-based outlier removal defined above to the upsampled frame.
df_clean = remove_outliers_iqr(df_upsampled, columns_to_check)
# Preview the cleaned dataframe.
df_clean.head()
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 37017 | 25 | 8 | 2 | 7 | 1 | 2 | 0 | 0 | 3 | 3 | 371 | 1 | 999 | 0 | 1 | -2.9 | 92.469 | -33.6 | 1.044 | 5076.2 | 1 |
| 36682 | 51 | 9 | 2 | 6 | 0 | 0 | 0 | 0 | 4 | 0 | 657 | 1 | 999 | 0 | 1 | -2.9 | 92.963 | -40.8 | 1.268 | 5076.2 | 1 |
| 29384 | 45 | 7 | 2 | 7 | 0 | 0 | 0 | 1 | 0 | 0 | 541 | 1 | 999 | 0 | 1 | -1.8 | 93.075 | -47.1 | 1.405 | 5099.1 | 1 |
| 21998 | 29 | 9 | 2 | 3 | 1 | 0 | 0 | 0 | 1 | 4 | 921 | 3 | 999 | 0 | 1 | 1.4 | 93.444 | -36.1 | 4.964 | 5228.1 | 1 |
| 16451 | 37 | 10 | 2 | 2 | 1 | 2 | 2 | 0 | 3 | 4 | 633 | 1 | 999 | 0 | 1 | 1.4 | 93.918 | -42.7 | 4.963 | 5228.1 | 1 |
# Rows remaining after outlier removal (down from 73,096 upsampled rows).
df_clean.shape
(49702, 21)
Seaborn es una biblioteca de python que permite hacer mejores gráficos fácilmente gracias a su función heatmap(). Un mapa de calor es una representación gráfica de datos donde cada valor de una matriz se representa como un color.
# Correlation heat map — every column is numeric after label encoding,
# so corr() covers the full frame.
plt.figure(figsize=(20, 16))
sns.heatmap(df_clean.corr(), annot=True, fmt='.2g')
<AxesSubplot:>
# Feature / target split: everything except `y` is a predictor.
X = df_clean.drop('y', axis=1)
y = df_clean['y']
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
Para ser precisos, el método split() genera los índices de entrenamiento y prueba, no los datos en sí mismos.
Tener múltiples divisiones puede ser útil si desea estimar mejor el rendimiento de su modelo.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# NOTE(review): the split happens *after* upsampling, so duplicated minority
# rows can land in both train and test — the test scores below are likely
# optimistic. Upsampling only the training fold would be methodologically sounder.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=0)
from sklearn.neighbors import KNeighborsClassifier
# 3 nearest neighbours, Euclidean distance, uniform (unweighted) votes.
knn = KNeighborsClassifier(n_neighbors=3,algorithm='auto',metric='euclidean',weights='uniform')
# Reminder: the extra keyword arguments are only needed when we want to pick a
# particular distance metric; otherwise specifying K alone is enough.
knn.fit(X_train, y_train)
KNeighborsClassifier(metric='euclidean', n_neighbors=3)
y_pred = knn.predict(X_test)
print('Precisión en el set de Entrenamiento: {:.2f}'
.format(knn.score(X_train, y_train)))
print('Precisión en el set de Test: {:.2f}'
.format(knn.score(X_test, y_test)))
Precisión en el set de Entrenamiento: 0.97 Precisión en el set de Test: 0.94
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, jaccard_score
# NOTE(review): with average='micro' on a single-label problem, F1, precision
# and recall all collapse to plain accuracy (hence the identical values in the
# output); average='binary' would report metrics for the positive class.
print('F-1 Score : ',(f1_score(y_test, y_pred, average='micro')))
print('Precision Score : ',(precision_score(y_test, y_pred, average='micro')))
print('Recall Score : ',(recall_score(y_test, y_pred, average='micro')))
print('Jaccard Score : ',(jaccard_score(y_test, y_pred, average='micro')))
F-1 Score : 0.9439694195754954 Precision Score : 0.9439694195754954 Recall Score : 0.9439694195754954 Jaccard Score : 0.8938845494379882
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
# Per-class precision/recall/F1 plus support counts.
print (classification_report(y_test, y_pred))
precision recall f1-score support
0 1.00 0.91 0.95 5928
1 0.88 1.00 0.93 4013
accuracy 0.94 9941
macro avg 0.94 0.95 0.94 9941
weighted avg 0.95 0.94 0.94 9941
# Confusion matrix. sklearn's convention: rows = actual class, columns =
# predicted class, with labels sorted ascending (class 0 first).
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# FIX: size the figure *before* drawing (the original created an empty figure
# after the heatmap) and label axes per sklearn's convention — the original
# had actual/predicted swapped and the 0/1 order reversed.
plt.figure(figsize=(9, 9))
cm_matrix = pd.DataFrame(data=cm,
                         index=['Actual Negative:0', 'Actual Positive:1'],
                         columns=['Predicted Negative:0', 'Predicted Positive:1'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='Blues')
<Figure size 900x900 with 0 Axes>
<Figure size 900x900 with 0 Axes>
La curva ROC (Receiver Operating Characteristic) se utiliza para evaluar el rendimiento de los algoritmos de clasificación binaria.
# Probability of the positive class, needed for the ROC analysis.
y_pred_prob = knn.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 7))
# Model ROC curve.
plt.plot(fpr, tpr, color="red", lw=2, label="Roc curve")
# Chance level: a random classifier has FPR == TPR.
plt.plot([0, 1], [0, 1], color="blue", lw=2, linestyle="--")
# Axis labels and title.
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Roc curve")
plt.show()
# Candidate neighbourhood sizes K = 1..19 for the elbow search below.
vecinos = np.arange(1, 20)
vecinos
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19])
Crear matrices (vacias)
# Pre-allocate accuracy holders, one slot per candidate K.
train_ex =np.empty(len(vecinos))
test_ex =np.empty(len(vecinos))
Crear "Loop" para valores de K
# Fit one KNN per candidate K and record train/test accuracy.
# NOTE: this rebinds `knn`; after the loop it holds the K=19 model,
# which the prediction cell further down relies on.
for idx, k in enumerate(vecinos):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_ex[idx] = knn.score(X_train, y_train)
    test_ex[idx] = knn.score(X_test, y_test)
train_ex
array([1. , 0.99967305, 0.96597168, 0.96499082, 0.94680717,
0.94539876, 0.93139006, 0.93008224, 0.92047484, 0.91707955,
0.91066623, 0.90875481, 0.90374991, 0.90083247, 0.89796534,
0.89668268, 0.89472096, 0.89298559, 0.89190413])
# Accuracy as a function of K, for both train and test sets.
plt.title("vecinos proximos KNN")
for serie, etiqueta in ((test_ex, "exactitud en test"),
                        (train_ex, "exactitud en train")):
    plt.plot(vecinos, serie, label=etiqueta)
plt.legend()
plt.xlabel("numero de vecinos")
plt.ylabel("exactitud")
plt.show()
# Predict the subscription outcome for two new, already label-encoded clients
# (feature order matches X's columns). Uses the last-fitted model (K=19).
X_new = np.array([[25, 8, 2, 7, 1, 2, 0, 0, 3, 3, 371, 1, 999, 0, 1, -2.9, 92.469, -33.6, 1.044, 5076.2],
                  [37, 10, 2, 2, 1, 2, 2, 0, 3, 4, 633, 1, 999, 0, 1, 1.4, 93.918, -42.7, 4.963, 5228.1]])
prediction1 = knn.predict(X_new)
print(prediction1)
# Persist the cleaned dataframe. FIX: removed the stray trailing
# `df_clean.to_csv` — a bare bound-method reference that was never called.
df_clean.to_csv("entrega_banking_KNN.csv", index = False)
<bound method NDFrame.to_csv of age job marital education default housing loan contact month \
37017 25 8 2 7 1 2 0 0 3
36682 51 9 2 6 0 0 0 0 4
29384 45 7 2 7 0 0 0 1 0
21998 29 9 2 3 1 0 0 0 1
16451 37 10 2 2 1 2 2 0 3
... ... ... ... ... ... ... ... ... ...
41177 57 5 1 5 0 2 0 0 7
41179 64 5 0 5 0 2 0 0 7
41180 36 0 1 6 0 0 0 0 7
41184 46 1 1 5 0 0 0 0 7
41185 56 5 1 6 0 2 0 0 7
day_of_week duration campaign pdays previous poutcome \
37017 3 371 1 999 0 1
36682 0 657 1 999 0 1
29384 0 541 1 999 0 1
21998 4 921 3 999 0 1
16451 4 633 1 999 0 1
... ... ... ... ... ... ...
41177 2 124 6 999 0 1
41179 0 151 3 999 0 1
41180 0 254 2 999 0 1
41184 0 383 1 999 0 1
41185 0 189 2 999 0 1
emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
37017 -2.9 92.469 -33.6 1.044 5076.2 1
36682 -2.9 92.963 -40.8 1.268 5076.2 1
29384 -1.8 93.075 -47.1 1.405 5099.1 1
21998 1.4 93.444 -36.1 4.964 5228.1 1
16451 1.4 93.918 -42.7 4.963 5228.1 1
... ... ... ... ... ... ..
41177 -1.1 94.767 -50.8 1.031 4963.6 0
41179 -1.1 94.767 -50.8 1.028 4963.6 0
41180 -1.1 94.767 -50.8 1.028 4963.6 0
41184 -1.1 94.767 -50.8 1.028 4963.6 0
41185 -1.1 94.767 -50.8 1.028 4963.6 0
[49702 rows x 21 columns]>